{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "df = pd.read_csv('Orange_Telecom_Churn_Data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>state</th>\n",
       "      <th>account_length</th>\n",
       "      <th>area_code</th>\n",
       "      <th>phone_number</th>\n",
       "      <th>intl_plan</th>\n",
       "      <th>voice_mail_plan</th>\n",
       "      <th>number_vmail_messages</th>\n",
       "      <th>total_day_minutes</th>\n",
       "      <th>total_day_calls</th>\n",
       "      <th>total_day_charge</th>\n",
       "      <th>...</th>\n",
       "      <th>total_eve_calls</th>\n",
       "      <th>total_eve_charge</th>\n",
       "      <th>total_night_minutes</th>\n",
       "      <th>total_night_calls</th>\n",
       "      <th>total_night_charge</th>\n",
       "      <th>total_intl_minutes</th>\n",
       "      <th>total_intl_calls</th>\n",
       "      <th>total_intl_charge</th>\n",
       "      <th>number_customer_service_calls</th>\n",
       "      <th>churned</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>KS</td>\n",
       "      <td>128</td>\n",
       "      <td>415</td>\n",
       "      <td>382-4657</td>\n",
       "      <td>no</td>\n",
       "      <td>yes</td>\n",
       "      <td>25</td>\n",
       "      <td>265.1</td>\n",
       "      <td>110</td>\n",
       "      <td>45.07</td>\n",
       "      <td>...</td>\n",
       "      <td>99</td>\n",
       "      <td>16.78</td>\n",
       "      <td>244.7</td>\n",
       "      <td>91</td>\n",
       "      <td>11.01</td>\n",
       "      <td>10.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2.70</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>OH</td>\n",
       "      <td>107</td>\n",
       "      <td>415</td>\n",
       "      <td>371-7191</td>\n",
       "      <td>no</td>\n",
       "      <td>yes</td>\n",
       "      <td>26</td>\n",
       "      <td>161.6</td>\n",
       "      <td>123</td>\n",
       "      <td>27.47</td>\n",
       "      <td>...</td>\n",
       "      <td>103</td>\n",
       "      <td>16.62</td>\n",
       "      <td>254.4</td>\n",
       "      <td>103</td>\n",
       "      <td>11.45</td>\n",
       "      <td>13.7</td>\n",
       "      <td>3</td>\n",
       "      <td>3.70</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NJ</td>\n",
       "      <td>137</td>\n",
       "      <td>415</td>\n",
       "      <td>358-1921</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>0</td>\n",
       "      <td>243.4</td>\n",
       "      <td>114</td>\n",
       "      <td>41.38</td>\n",
       "      <td>...</td>\n",
       "      <td>110</td>\n",
       "      <td>10.30</td>\n",
       "      <td>162.6</td>\n",
       "      <td>104</td>\n",
       "      <td>7.32</td>\n",
       "      <td>12.2</td>\n",
       "      <td>5</td>\n",
       "      <td>3.29</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>OH</td>\n",
       "      <td>84</td>\n",
       "      <td>408</td>\n",
       "      <td>375-9999</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>0</td>\n",
       "      <td>299.4</td>\n",
       "      <td>71</td>\n",
       "      <td>50.90</td>\n",
       "      <td>...</td>\n",
       "      <td>88</td>\n",
       "      <td>5.26</td>\n",
       "      <td>196.9</td>\n",
       "      <td>89</td>\n",
       "      <td>8.86</td>\n",
       "      <td>6.6</td>\n",
       "      <td>7</td>\n",
       "      <td>1.78</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>OK</td>\n",
       "      <td>75</td>\n",
       "      <td>415</td>\n",
       "      <td>330-6626</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>0</td>\n",
       "      <td>166.7</td>\n",
       "      <td>113</td>\n",
       "      <td>28.34</td>\n",
       "      <td>...</td>\n",
       "      <td>122</td>\n",
       "      <td>12.61</td>\n",
       "      <td>186.9</td>\n",
       "      <td>121</td>\n",
       "      <td>8.41</td>\n",
       "      <td>10.1</td>\n",
       "      <td>3</td>\n",
       "      <td>2.73</td>\n",
       "      <td>3</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  state  account_length  area_code phone_number intl_plan voice_mail_plan  \\\n",
       "0    KS             128        415     382-4657        no             yes   \n",
       "1    OH             107        415     371-7191        no             yes   \n",
       "2    NJ             137        415     358-1921        no              no   \n",
       "3    OH              84        408     375-9999       yes              no   \n",
       "4    OK              75        415     330-6626       yes              no   \n",
       "\n",
       "   number_vmail_messages  total_day_minutes  total_day_calls  \\\n",
       "0                     25              265.1              110   \n",
       "1                     26              161.6              123   \n",
       "2                      0              243.4              114   \n",
       "3                      0              299.4               71   \n",
       "4                      0              166.7              113   \n",
       "\n",
       "   total_day_charge   ...     total_eve_calls  total_eve_charge  \\\n",
       "0             45.07   ...                  99             16.78   \n",
       "1             27.47   ...                 103             16.62   \n",
       "2             41.38   ...                 110             10.30   \n",
       "3             50.90   ...                  88              5.26   \n",
       "4             28.34   ...                 122             12.61   \n",
       "\n",
       "   total_night_minutes  total_night_calls  total_night_charge  \\\n",
       "0                244.7                 91               11.01   \n",
       "1                254.4                103               11.45   \n",
       "2                162.6                104                7.32   \n",
       "3                196.9                 89                8.86   \n",
       "4                186.9                121                8.41   \n",
       "\n",
       "   total_intl_minutes  total_intl_calls  total_intl_charge  \\\n",
       "0                10.0                 3               2.70   \n",
       "1                13.7                 3               3.70   \n",
       "2                12.2                 5               3.29   \n",
       "3                 6.6                 7               1.78   \n",
       "4                10.1                 3               2.73   \n",
       "\n",
       "   number_customer_service_calls  churned  \n",
       "0                              1    False  \n",
       "1                              1    False  \n",
       "2                              0    False  \n",
       "3                              2    False  \n",
       "4                              3    False  \n",
       "\n",
       "[5 rows x 21 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head() #初步分析、判断哪些数据是无关的（不过有一些所谓的手机号具有数字能量的大仙-。-）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df.drop(['state','area_code','phone_number'],axis =1, inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.shape(df.values)[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelBinarizer \n",
    "#需要把标称型数据preprocessing.LabelBinarizer进行二值化处理，\n",
    "#比如把yes or no 转化为0\n",
    "lb = LabelBinarizer()\n",
    "for col in ['intl_plan','voice_mail_plan','churned']:\n",
    "    df[col] = lb.fit_transform(df[col])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler #标准化数据 [0,1]\n",
    "msc = MinMaxScaler()\n",
    "df = pd.DataFrame(msc.fit_transform(df),columns = df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_df = df.copy()\n",
    "Y_df = X_df.pop('churned')\n",
    "#or\n",
    "# x_cols = [x for x in df.columns if x !='churned']#获取不含分类标签的列\n",
    "# X_df = df[x_cols]\n",
    "# Y_df = df['churned']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "knn = KNeighborsClassifier(n_neighbors = 3)\n",
    "knn = knn.fit(X_df,Y_df)\n",
    "Y_predict = knn.predict(X_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def accuracy(real,predict):\n",
    "    return sum(Y_df == Y_predict)/float(real.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9422"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy(Y_df,Y_predict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "完整代码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9422"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.preprocessing import LabelBinarizer \n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "\n",
    "df = pd.read_csv('Orange_Telecom_Churn_Data.csv')\n",
    "df.drop(['state','area_code','phone_number'],axis =1, inplace = True)\n",
    "\n",
    "lb = LabelBinarizer()\n",
    "for col in ['intl_plan','voice_mail_plan','churned']:\n",
    "    df[col] = lb.fit_transform(df[col])\n",
    "    \n",
    "msc = MinMaxScaler()\n",
    "df = pd.DataFrame(msc.fit_transform(df),columns = df.columns)\n",
    "\n",
    "X_df = df.copy()\n",
    "Y_df = X_df.pop('churned')\n",
    "\n",
    "knn = KNeighborsClassifier(n_neighbors = 3)\n",
    "knn = knn.fit(X_df,Y_df)\n",
    "Y_predict = knn.predict(X_df)\n",
    "\n",
    "def accuracy(real,predict):\n",
    "    return sum(Y_df == Y_predict)/float(real.shape[0])\n",
    "\n",
    "accuracy(Y_df,Y_predict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "缺点：测试集同时也是训练集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
